# Clear memory
rm(list = ls())

# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  data.table, janitor, magrittr, fixest, 
  broom, tidyverse, tidylog
)
options("tidylog.display" = NULL)
`%notin%` = Negate(`%in%`)

##############################################
## CO, NO2, SO2, O3: DO NOT REQUIRE MULTI-YEAR AVERAGING
## CO: 2nd highest value
## NO2: annual mean
## SO2: 2nd highest value and annual mean
## O3: 2nd highest value
##############################################

# load data containing the observations of pollution in terms of the standards for each
monitor_df = read_csv("raw/EPA/annual_conc_by_monitor_1997.csv") %>% 
    clean_names() %>% 
    filter(parameter_name %in% c("Carbon monoxide",
                                 "Nitrogen dioxide (NO2)", 
                                 "Ozone", 
                                 "Sulfur dioxide"))

monitor_df = monitor_df %>%
    filter(pollutant_standard %in% sort(unique(monitor_df$pollutant_standard))[c(1,2,3,4,5,8,9,10,11,12)])

# keep only the worst monitor reading of the year across all monitors in a county
county_df = monitor_df %>% 
    group_by(state_code, county_code, parameter_code, parameter_name, pollutant_standard) %>% 
    summarise(across(arithmetic_mean:x10th_percentile, ~ max(.x, na.rm = TRUE))) %>% 
    ungroup()

# keep only the relevant variables
county_df = county_df %>% 
    select(state_code, county_code, parameter_name, pollutant_standard,
           arithmetic_mean, x1st_max_value, x2nd_max_value, x3rd_max_value, x4th_max_value,
           x99th_percentile, x98th_percentile, x95th_percentile, x90th_percentile) %>% 
    mutate(parameter_name =
               case_when(
                  parameter_name == "Carbon monoxide" ~ "co",
                  parameter_name == "Nitrogen dioxide (NO2)" ~ "no2",
                  parameter_name == "Ozone" ~ "o3",
                  parameter_name == "Sulfur dioxide" ~ "so2"
               )) %>% 
    mutate(pollutant_standard =
               case_when(
                   pollutant_standard == "CO 1-hour 1971" ~ "co_1_hour",
                   pollutant_standard == "CO 8-hour 1971" ~ "co_8_hour",
                   pollutant_standard == "NO2 Annual 1971" ~ "no2_annual",
                   pollutant_standard == "NO2 1-hour 2010" ~ "no2_1_hour",
                   pollutant_standard == "SO2 24-hour 1971" ~ "so2_24_hour",
                   pollutant_standard == "SO2 3-hour 1971" ~ "so2_3_hour",
                   pollutant_standard == "SO2 1-hour 2010" ~ "so2_1_hour",
                   pollutant_standard == "SO2 Annual 1971" ~ "so2_annual",
                   pollutant_standard == "Ozone 1-hour 1979" ~ "o3_1_hour",
                   pollutant_standard == "Ozone 8-hour 2015" ~ "o3_8_hour"
               )) %>% 
    pivot_wider(
        names_from = pollutant_standard,
        values_from = arithmetic_mean:x90th_percentile,
        names_sep = "_"
    ) %>% 
    select(state_code, county_code, 
           co_1h = x2nd_max_value_co_1_hour, # 35ppm 1 hour 9ppm 8 hour X
           co_8h = x2nd_max_value_co_8_hour, # 35ppm 1 hour 9ppm 8 hour X
           no2_annual = arithmetic_mean_no2_annual, # .14ppm 24 hour (still missing secondary 3-hour standard)
           no2_1h = x98th_percentile_no2_1_hour, # .14ppm 24 hour (still missing secondary 3-hour standard)
           o3_1h = x2nd_max_value_o3_1_hour, # .12ppm 1 hour X
           o3_8h = x4th_max_value_o3_8_hour,
           so2_annual = arithmetic_mean_so2_annual, # .02ppm annual
           so2_24h = x2nd_max_value_so2_24_hour,
           so2_3h = x2nd_max_value_so2_3_hour,
           so2_1h = x99th_percentile_so2_1_hour,
           ) %>% 
    mutate(fips = paste0(state_code, county_code)) %>% 
    relocate(fips) %>% 
    select(-state_code, -county_code)

##############################################
## PM10: 3 year averaging
##############################################

# load data containing the observations of pollution in terms of the standards for each
monitor_df = bind_rows(
    read_csv("raw/EPA/annual_conc_by_monitor_1997.csv"),
    read_csv("raw/EPA/annual_conc_by_monitor_1996.csv"),
    read_csv("raw/EPA/annual_conc_by_monitor_1995.csv")
    ) %>% 
    clean_names() %>% 
    filter(parameter_name %in% c("PM10 Total 0-10um STP"))

monitor_df = monitor_df %>% 
    filter(pollutant_standard %in% sort(unique(monitor_df$pollutant_standard))[c(1)])

# keep only the worst monitor reading of the year across all monitors in a state
pm_df = monitor_df %>% 
    group_by(state_code, county_code, parameter_code, parameter_name, pollutant_standard, year) %>% 
    summarise(across(arithmetic_mean:x10th_percentile, ~ max(.x, na.rm = T))) %>% 
    ungroup() %>% 
  filter(!is.infinite(x2nd_max_value))

# keep only the relevant variables
pm_df = pm_df %>% 
    select(state_code, county_code, year, parameter_name, pollutant_standard,
           arithmetic_mean, x1st_max_value, x2nd_max_value, x3rd_max_value, x4th_max_value,
           x99th_percentile, x98th_percentile, x95th_percentile, x90th_percentile) %>% 
    mutate(parameter_name =
               case_when(
                   parameter_name == "PM10 Total 0-10um STP" ~ "pm10",
               )) %>% 
    mutate(pollutant_standard =
               case_when(
                   pollutant_standard == "PM10 24-hour 2006" ~ "pm10_24_hour"
               )) %>% 
    pivot_wider(
        names_from = pollutant_standard,
        values_from = arithmetic_mean:x90th_percentile,
        names_sep = "_"
    ) %>% 
    select(state_code, county_code, year,
           x2nd_max_value_pm10_24_hour)

pm_df = pm_df %>% 
    mutate(fips = paste0(state_code, county_code)) %>% 
    select(-state_code, -county_code) %>% 
    group_by(fips) %>% 
    summarise(
        pm10_24h = mean(x2nd_max_value_pm10_24_hour, na.rm = T)
    ) %>% 
    ungroup() 

county_df = full_join(county_df, pm_df)

county_df[is.na(county_df)] = 0

crosswalk = read_csv("data/simulation_fips_crosswalk.csv")

county_df = crosswalk %>% 
  left_join(county_df)

county_df[is.na(county_df)] = 0

county_df = county_df %>% 
  group_by(fips) %>% 
  summarise(across(everything(), ~ max(.x, na.rm = T)))

county_df = county_df %>% 
  mutate(
    co_1h_std = 35,
    co_8h_std = 9,
    no2_annual_std = 53,
    o3_1h_std = .12,
    so2_annual_std = 20,
    so2_3h_std = 500,
    so2_24h_std = 140,
    pm10_24h_std = 150,
    
    co_1h_std_2022 = 35,
    co_8h_std_2022 = 9,
    no2_annual_std_2022 = 53,
    no2_1h_std_2022 = 100,
    o3_8h_std_2022 = .07,
    so2_3h_std_2022 = 500,
    so2_1h_std_2022 = 75,
    pm10_24h_std_2022 = 150
  )

fwrite(county_df, "data/concentrations_criteria_1997.csv")